Here we calculate the intergenerational coefficients of variation of 22G-RNA normalized counts, and determine which genes show reduced intergenerational variance in 22G-RNA counts.
library(ggplot2)
library(MASS)
library(viridis)
## Loading required package: viridisLite
library(msir)
## Package 'msir' version 1.3.2
## Type 'citation("msir")' for citing this R package in publications.
overall_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row))){
for (j in seq(length(row))){
if (i>j){
sqsum=sqsum+(row[j]-row[i])**2
ncomps=ncomps+1
}}}
return(sqrt(sqsum/ncomps))
}
#overall sd calculation but sampling only 10 pairs of data points randomly (to be able to fairly compare the distributions to those of the intergenerational variance which is calculated from 10 pairs of consecutive data points.)
overall_sd_tendatapoints<-function(row){
sqsum=vector()
ncomps=vector()
for (i in seq(length(row))){
for (j in seq(length(row))){
if (i>j){
sqsum=c(sqsum,(row[j]-row[i])**2)
}}}
sqsum_sampled=sample(sqsum,size=10, replace = FALSE, prob = NULL)
sqsum<-sum(sqsum_sampled)
return(sqrt(sqsum/10))
}
#intergenerational sd calculation
intergenerational_sd<-function(row){
sqsum=0
ncomps=0
for (i in seq(length(row)-1)){
if (i != 6){
sqsum=sqsum+(row[i+1]-row[i])**2
ncomps=ncomps+1
}}
return(sqrt(sqsum/ncomps))
}
#density
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
loess_fit_and_plotres<-function(df,x,y,thr_type,thr){
dat<-df
#dat<-dat[which(dat[,"log10_mean"]>0),]
l<-loess.sd(x = dat[,x],y = dat[,y], nsigma = 1.96)
l_fit<-data.frame(x=l$x,y=l$y,sd=l$sd,upper=l$upper,lower=l$lower,ID=dat$ID)
l_fit$Z<-(dat[,y]-l_fit$y)/l_fit$sd
l_fit$pvalue<-pnorm(l_fit$Z,lower.tail = FALSE)
l_fit$ID<-dat$ID
if (thr_type=="fdr"){
l_fit_padj<-l_fit[which(l_fit$x>0.9999999),]
l_fit_padj$padj<-p.adjust(l_fit_padj$pvalue,method = "fdr")
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit_padj[which(l_fit_padj$padj<thr),"ID"]),"sig"]<-1
}
else if (thr_type=="pval"){
dat$sig<-rep(0,nrow(dat))
dat[which(dat$ID %in% l_fit[which(l_fit$pvalue<thr),"ID"]),"sig"]<-1
dat[which(dat$log10_mean<1),"sig"]<-0
}
dat$density<-get_density(dat[,x],dat[,y], n = 100)
print(ggplot(dat)+geom_point(aes(dat[,x],dat[,y],color=density))+ scale_color_viridis()+geom_line(aes(x=l_fit$x,y=l_fit$lower),color="pink",linetype="dashed")+geom_line(aes(x=l_fit$x,y=l_fit$y),color="pink")+geom_line(aes(x=l_fit$x,y=l_fit$upper),color="pink",linetype="dashed")+geom_point(data=subset(dat,sig==1),aes(dat[which(dat$sig==1),x],dat[which(dat$sig==1),y]),color="red")+geom_hline(yintercept = -0.6,linetype="dashed")+ylab(y)+xlab(x))
dat<-merge(dat,l_fit[,c("pvalue","ID","Z")],by="ID")
return(dat[which(dat$sig==1),])
}
setwd("/Users/ab6415/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES/06_Figure4/")
ttg_DEseqnorm<-read.table("../02_Normalised_counts/22G_DEseqnorm_counts_averaged.txt")
ttg_A_DEseqnorm<-ttg_DEseqnorm[,1:12]
dat_ttgA<-data.frame(mean=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=mean),
sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=sd),
overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd),
inter_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
subsampled_overall_sd=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
ID=rownames(ttg_A_DEseqnorm),
max=apply(ttg_A_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgA<-dat_ttgA[which(dat_ttgA$mean>0),]
#calculate log10cv2 values
dat_ttgA$log10_overall_cv2<-log10((dat_ttgA$overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_inter_cv2<-log10((dat_ttgA$inter_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_overallsub_cv2<-log10((dat_ttgA$subsampled_overall_sd/dat_ttgA$mean)**2)
dat_ttgA$log10_mean<-log10(dat_ttgA$mean+1)
dat_ttgA$log10_cv2<-log10((dat_ttgA$sd/dat_ttgA$mean)**2)
dat_ttgA$ff<-(dat_ttgA$overall_sd**2)/dat_ttgA$mean
dat_ttgA<-dat_ttgA[order(dat_ttgA$log10_mean,decreasing = FALSE),]
#plot
dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_overall_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")
dat_ttgA_noInf<-dat_ttgA[-which(dat_ttgA$log10_overallsub_cv2== (-Inf)),]
dat_ttgA_noInf$density<-get_density(dat_ttgA_noInf$log10_mean,dat_ttgA_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgA_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")
## Warning: Removed 1 rows containing missing values (geom_point).
dat_ttgA$density<-get_density(dat_ttgA$log10_mean,dat_ttgA$log10_inter_cv2, n = 100)
ggplot(dat_ttgA)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")
ttgA_fdr0.2<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","fdr",0.2)
ttgA_pvalue0.01<-loess_fit_and_plotres(dat_ttgA,"log10_mean","log10_overall_cv2","pval",0.01)
ttg_B_DEseqnorm<-ttg_DEseqnorm[,13:24]
dat_ttgB<-data.frame(mean=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=mean),
sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=sd),
overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd),
inter_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=intergenerational_sd),
subsampled_overall_sd=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN=overall_sd_tendatapoints),
ID=rownames(ttg_B_DEseqnorm),
max=apply(ttg_B_DEseqnorm,MARGIN = 1,FUN = max))
dat_ttgB<-dat_ttgB[which(dat_ttgB$mean>0),]
#calculate log10cv2 values
dat_ttgB$log10_overall_cv2<-log10((dat_ttgB$overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_inter_cv2<-log10((dat_ttgB$inter_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_overallsub_cv2<-log10((dat_ttgB$subsampled_overall_sd/dat_ttgB$mean)**2)
dat_ttgB$log10_mean<-log10(dat_ttgB$mean+1)
dat_ttgB$log10_cv2<-log10((dat_ttgB$sd/dat_ttgB$mean)**2)
dat_ttgB$ff<-(dat_ttgB$overall_sd**2)/dat_ttgB$mean
dat_ttgB<-dat_ttgB[order(dat_ttgB$log10_mean,decreasing = FALSE),]
#plot
dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_overall_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_overall_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2")
dat_ttgB_noInf<-dat_ttgB[-which(dat_ttgB$log10_overallsub_cv2== (-Inf)),]
dat_ttgB_noInf$density<-get_density(dat_ttgB_noInf$log10_mean,dat_ttgB_noInf$log10_overallsub_cv2, n = 100)
ggplot(dat_ttgB_noInf)+geom_point(aes(log10_mean,log10_overallsub_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, overall cv2, subsampled")
dat_ttgB$density<-get_density(dat_ttgB$log10_mean,dat_ttgB$log10_inter_cv2, n = 100)
ggplot(dat_ttgB)+geom_point(aes(log10_mean,log10_inter_cv2,color=density))+ scale_color_viridis()+ylim(-3,2)+ggtitle("lineage A, intergenerational cv2")
ttgB_fdr0.2<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","fdr",0.2)
ttgB_pvalue0.01<-loess_fit_and_plotres(dat_ttgB,"log10_mean","log10_overall_cv2","pval",0.01)
length(ttgA_fdr0.2$ID); length(ttgB_fdr0.2$ID)
## [1] 120
## [1] 74
length(intersect(ttgA_fdr0.2$ID,ttgB_fdr0.2$ID))
## [1] 15
length(ttgA_pvalue0.01$ID); length(ttgB_pvalue0.01$ID)
## [1] 198
## [1] 175
length(intersect(ttgA_pvalue0.01$ID,ttgB_pvalue0.01$ID))
## [1] 63
write.table(ttgA_fdr0.2$ID,file="HV22Gs_ttgA_fdr0.2.txt",quote = FALSE)
write.table(ttgB_fdr0.2$ID,file="HV22Gs_ttgB_fdr0.2.txt",quote = FALSE)
write.table(ttgA_pvalue0.01$ID,file="HV22Gs_ttgA_pval0.01.txt",quote = FALSE)
write.table(ttgB_pvalue0.01$ID,file="HV22Gs_ttgB_pval0.01.txt",quote = FALSE)
#get all permutations of 12 data points --> calculate cv2 using consecutive dps --> how does it compare to the actual cv2?
#permutations(n=12,r=12,v=1:12,repeats.allowed = FALSE)
#too many combinations and becomes computationally infeasible
n_perms_cv2<-function(row){
N<-100000
cv2s<-rep(0,N)
for (rep in seq(N)){
perm<-sample(row,size=12,replace=FALSE)
cv2s[rep]<-intergenerational_sd(perm)
}
obs_cv2<-intergenerational_sd(row)
p_value<-(length(which(cv2s<=obs_cv2))+1)/N
return(p_value)
}
n_perms_cv2_morevar<-function(row){
N<-100000
cv2s<-rep(0,N)
for (rep in seq(N)){
perm<-sample(row,size=12,replace=FALSE)
cv2s[rep]<-intergenerational_sd(perm)
}
obs_cv2<-intergenerational_sd(row)
p_value<-(length(which(cv2s>=obs_cv2))+1)/N
return(p_value)
}
#calculate pvals genome-wide (just ran it once with 10^5 reps and now I'm opening the saved file)
atleast_ten_ttgs_A<-dat_ttgA[which(dat_ttgA$mean>=10),"ID"]
atleast_ten_ttgs_B<-dat_ttgA[which(dat_ttgB$mean>=10),"ID"]
write.table(union(atleast_ten_ttgs_A,atleast_ten_ttgs_B),file="at_least_10_22Gs_in_atleast_onelin.txt")
#removed genes with <10 mean 22Gs to gain statistical power
#ttg_A_lessvar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs_A),],MARGIN = 1,FUN=n_perms_cv2)
#ttg_B_lessvar_p<-apply(ttg_B_DEseqnorm[which(rownames(ttg_B_DEseqnorm) %in% atleast_ten_ttgs_B),],MARGIN = 1,FUN=n_perms_cv2)
#
# morevar_p<-apply(ttg_A_DEseqnorm[which(rownames(ttg_A_DEseqnorm) %in% atleast_ten_ttgs),],MARGIN = 1,FUN=n_perms_cv2_morevar)
# #morevar_padj<-p.adjust(morevar_p,method = "fdr")
#
#write.table(ttg_A_lessvar_p,file="ttgA_pvalues_lessvar_100000reps.txt",quote=FALSE)
#write.table(ttg_B_lessvar_p,file="ttgB_pvalues_lessvar_100000reps.txt",quote=FALSE)
#write.table(morevar_p,file="../gene_lists/pvalues_morevar_100000reps.txt",quote=FALSE)
#
# #open saved files
#
lessvar_p_A<-read.table("ttgA_pvalues_lessvar_100000reps.txt")
lessvar_df_A<-data.frame(pval=lessvar_p_A$x,padj=p.adjust(lessvar_p_A$x,method="fdr"),ID=rownames(lessvar_p_A))
lessvar_p_B<-read.table("ttgB_pvalues_lessvar_100000reps.txt")
lessvar_df_B<-data.frame(pval=lessvar_p_B$x,padj=p.adjust(lessvar_p_B$x,method="fdr"),ID=rownames(lessvar_p_B))
ggplot(lessvar_df_A)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linA_pvalhist.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linA_padjhist.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linB_pvalhist.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linB_padjhist.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A$padj<0.1))
## [1] 188
length(which(lessvar_df_B$padj<0.1))
## [1] 354
length(which(lessvar_df_A$padj<0.2))
## [1] 565
length(which(lessvar_df_B$padj<0.2))
## [1] 848
length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 58
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]))
## [1] 484
#188 in lineage A, 354 in lineage B, overlap 58
length(intersect(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 203
length(union(lessvar_df_A[which(lessvar_df_A$padj<0.2),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.2),"ID"]))
## [1] 1210
#565 in lineage A, 848 in lineage B, overlap 203
write.table(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1.txt")
write.table(lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1.txt")
write.table(union(lessvar_df_A[which(lessvar_df_A$padj<0.1),"ID"],lessvar_df_B[which(lessvar_df_B$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1.txt")
write.table(lessvar_df_A[,"ID"],file="inter_overall_genes_linA_BACKGROUND_LIST.txt")
write.table(lessvar_df_B[,"ID"],file="inter_overall_genes_linB_BACKGROUND_LIST.txt")
removegenes<-read.table("../05_Figure3/removegenes.txt")
removegenes<-removegenes$x
lessvar_df_A_nobatch<-lessvar_df_A[-which(lessvar_df_A$ID %in% removegenes),]
lessvar_df_B_nobatch<-lessvar_df_B[-which(lessvar_df_B$ID %in% removegenes),]
ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linA_pvalhist_nobatch.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_A_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linA_padjhist_nobatch.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=pval))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linB_pvalhist_nobatch.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(lessvar_df_B_nobatch)+geom_histogram(aes(x=padj))+theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("inter_vs_overall_linB_padjhist_nobatch.pdf",dpi=150)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(lessvar_df_A_nobatch$padj<0.1))
## [1] 114
length(which(lessvar_df_B_nobatch$padj<0.1))
## [1] 237
length(which(lessvar_df_A_nobatch$padj<0.2))
## [1] 364
length(which(lessvar_df_B_nobatch$padj<0.2))
## [1] 606
length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 30
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]))
## [1] 321
#114 in lineage A, 237 in lineage B, overlap 30, union 321
length(intersect(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 97
length(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.2),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.2),"ID"]))
## [1] 873
#364 in lineage A, 606 in lineage B, overlap 97, union 873
write.table(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linA_FDR0.1_nobatch.txt")
write.table(lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"],file="inter_overall_genes_linB_FDR0.1_nobatch.txt")
write.table(union(lessvar_df_A_nobatch[which(lessvar_df_A_nobatch$padj<0.1),"ID"],lessvar_df_B_nobatch[which(lessvar_df_B_nobatch$padj<0.1),"ID"]),file="inter_overall_genes_linsAB_FDR0.1_nobatch.txt")
top20<-union(lessvar_df_A[order(lessvar_df_A$padj),"ID"][1:20],lessvar_df_B[order(lessvar_df_B$padj),"ID"][1:20])
for (gene in top20){
plot(1:12,as.numeric(ttg_DEseqnorm[gene,1:12]),main=paste(gene,"lineage A",sep=" "))
plot(13:24,as.numeric(ttg_DEseqnorm[gene,13:24]),main=paste(gene,"lineage B",sep=" "))
}